Data importing (following the course book “Assessment Data”)
library(ggplot2)
library(plyr)
library(gdata)
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(stringr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:gdata':
##
## first, last
## Prep Osnabrugge et al.
# Load the Osnabrugge et al. UK parliamentary speeches dataset.
# NOTE(review): absolute local path — will only run on this machine; consider
# a relative path or a configurable data directory.
data = fread("/Users/garamkim/Downloads/dataverse_files/uk_data.csv", encoding="UTF-8")
# Parse the speech date column into Date class for range comparisons below.
data$date = as.Date(data$date)
#Create time variable
# Half-year label "YY/1" (Jan-Jun) or "YY/2" (Jul-Dec) for each speech.
# Replaces 38 hand-written date-range assignments with one vectorized
# computation; dates outside 2001-2019 (and NA dates) are left NA, exactly
# as in the original range-by-range version.
data$time <- with(data, {
  yr <- as.integer(format(date, "%Y"))
  # First half of the year = months 1-6, second half = months 7-12.
  half <- ifelse(as.integer(format(date, "%m")) <= 6L, 1L, 2L)
  ifelse(yr >= 2001 & yr <= 2019,
         sprintf("%02d/%d", yr %% 100, half),  # e.g. 2001 -> "01/1"
         NA)
})
# Underscore-separated copy of the half-year label (e.g. "01/1" -> "01_1"),
# handy for file or variable names. sub() replaces the first (and only)
# slash, matching the original str_replace behavior; NA stays NA.
data$time2 <- sub("/", "_", data$time)
# Encode the debate stage as an integer, defaulting to 0.
# Flags are applied in the same order as the original assignments, so a
# speech matching several indicator columns keeps the LAST matching code
# (pm_questions = 5 wins over earlier flags).
data$stage <- 0
stage_codes <- c(m_questions = 1, u_questions = 2, queen_debate_others = 3,
                 queen_debate_day1 = 4, pm_questions = 5)
for (flag in names(stage_codes)) {
  data$stage[data[[flag]] == 1] <- stage_codes[[flag]]
}
Inspecting the data and selecting the columns needed for the research
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::combine() masks gdata::combine()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first(), gdata::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ dplyr::id() masks plyr::id()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ purrr::keep() masks gdata::keep()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last(), gdata::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ lubridate::second() masks data.table::second()
## ✖ dplyr::starts_with() masks tidyr::starts_with(), gdata::starts_with()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(tidytext)
library(quanteda)
## Package version: 3.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(textdata)
#Filtering data columns
# List all column names of the loaded dataset to decide which ones to keep.
colnames(data)
## [1] "id_speech" "id_mp"
## [3] "period" "last_name"
## [5] "first_name" "date"
## [7] "pm_questions" "queen_debate_day1"
## [9] "queen_debate_others" "m_questions"
## [11] "u_questions" "other_debate"
## [13] "leader" "prime_minister"
## [15] "senior_minister" "shadow"
## [17] "cabinet" "chair"
## [19] "government" "female"
## [21] "age" "electoral_cycle"
## [23] "party" "linear_trend"
## [25] "words" "text"
## [27] "emotive_count" "neutral_count"
## [29] "emotive_rhetoric" "emotive_rhetoric_log"
## [31] "emotive_words" "top_topic"
## [33] "anew_rescaled" "emotive_rhetoric_liwc"
## [35] "positive_count" "negative_count"
## [37] "emotive_positive" "emotive_negative"
## [39] "emotive_count_250_8" "neutral_count_250_8"
## [41] "emotive_rhetoric_250_8" "emotive_count_300_10"
## [43] "neutral_count_300_10" "emotive_rhetoric_300_10"
## [45] "emotive_count_a1" "neutral_count_a1"
## [47] "emotive_rhetoric_a1" "emotive_count_a2"
## [49] "neutral_count_a2" "emotive_rhetoric_a2"
## [51] "time" "time2"
## [53] "stage"
# Keep only the speaker metadata and the speech text needed for the analysis.
data <- select(data, last_name, first_name, date, female, age, party, text)
# Preview the first six rows of the reduced dataset.
head(data)
## last_name first_name date female age party
## <char> <char> <Date> <int> <int> <char>
## 1: dalyell tam 2001-06-13 0 69 Labour
## 2: young george 2001-06-13 0 60 Conservative
## 3: cook robin 2001-06-13 0 55 Labour
## 4: hague william 2001-06-13 0 40 Conservative
## 5: kennedy charles 2001-06-13 0 42 Liberal Democrats
## 6: trimble david 2001-06-13 0 57 Ulster Unionist Party
## text
## <char>
## 1: Under the provisions of Standing Order No 1A I am now required to ascertain whether Mr Michael J Martin is willing to be chosen as Speaker
## 2: I beg to move That Mr Michael J Martin do take the Chair of this House as Speaker Mr Dalyell I begin by congratulating you on your accession as Father of the House Given your commitment to the House and your knowledge of how to use it effectively it gives all of us pleasure to see you supervising our proceedings today I too wish you a full and speedy recovery I also hope that you will continue to be as irreverent to those on your Front Bench as your predecessor as Father of the House was to mine As we meet this afternoon there are two important vacancies in our political institutions and I am happy to make a nomination for one of them In commending the right hon Member for Glasgow Springburn Mr Martin as Speaker I congratulate him on his re election to Parliament Although his election was contested I suspect that he did not experience the aggravation that the rest of us suffered in getting here I mention in passing that we in North West Hampshire saw a new form of co operation between the other two parties the Labour party ran a low key campaign to let the Liberal Democrats have a clear shot at me and the Liberal Democrats ran a low key campaign to give Labour a shot I am happy to say that my majority increased to more than 12 000 There are two reasons why I move the motion the first of which is continuity I do not believe that our Speaker should be put into play at the beginning of every Parliament I can do no better than quote the second report of the Select Committee on Procedure published in February In our view it would be undesirable in these circumstances for a multi candidate ballot to take place automatically As we have seen since the middle of the nineteenth century there has been a strong presumption that a Speaker once elected by the House is not subsequently challenged If it were to become accepted that a change in the composition of the House following a General Election were as a matter of course to lead to a change in the occupancy of the 
Chair we believe there are grave dangers that the office itself would be destabilised and in danger of becoming politicised Equally however we believe it is important that the House should not be denied the right to change its Speaker however unlikely it may be that that right will be exercised I agree with that sentiment as did the House when it agreed to change our procedures Although the Speaker should be validated there should be a presumption against challenging the incumbent Continuity is not the only reason If we were starting from scratch I believe that the House would choose the right hon Member for Springburn as its Speaker today His long service in the House and deep roots in the Back Benches his work on the Chairmen s Panel and domestic Committees his experience as Deputy Speaker his genial and approachable manner underpinned by a deep affection and commitment to the House all those qualities strike a chord with the House That commitment was confirmed in his acceptance speech last October and reinforced in the statement that we have just heard which was greatly welcomed All Speakers develop their own style and we saw the right hon Member for Springburn develop his in the previous Parliament a more approachable informal style of Speaker mixing with Members in the Tea Room and elsewhere no wig or silk stockings rather like the Scandinavian approach to the monarchy but so far without the bicycle A key role lies ahead for Mr Speaker in this Parliament There is a strong view that the House should reassert the accountability of the Executive to Parliament A growing number of hon Members want the House to be more relevant to the concerns of those whom we represent more effective in what it does and a better link between Government and governed The right hon Member for Springburn is well qualified to supervise and manage that debate during this Parliament I believe that he is entitled to support from the whole House in that role The House will understand why I 
was unable to propose the right hon Member for Springburn last time but I have no hesitation in proposing him today Question put and agreed to Resolved That Mr Michael J Martin do take the Chair of this House as Speaker Whereupon Mr Tam Dalyell left the Chair and Mr Michael J Martin was taken out of his place and conducted to the Chair by Sir George Young and Ann Keen
## 3: Mr Speaker Elect it is a great pleasure to congratulate you on your return to the Chair of the House I think I speak for all Members when I say it is also a great pleasure for all of us that we have completed your election for a second time in rather shorter order than the six hours that it took us the first time May I add my congratulations to my neighbour my hon Friend the Member for Linlithgow Mr Dalyell on becoming Father of the House No hon Member has shown greater affection for the traditions of the House or carried out his duties as a Member with greater diligence than my hon Friend As a fellow West Lothian Member of Parliament perhaps I can explain on his behalf to the House that his injury is a tribute to the diligence with which he pursues his constituency office as it was incurred with an excess of enthusiasm at a meeting of the Linlithgow football team It therefore was an asset not a liability in the recent election Mr Speaker Elect there have been only 155 Speakers before you in the many centuries of your office When Speaker Yelverton in 1597 was asked to describe the necessary qualities of a Speaker he replied A man big and comely his carriage majestical his nature haughty and his purse plentiful Fortunately the characteristics of the Speaker have changed over the subsequent four centuries For myself I am relieved that we no longer expect our Speaker to be haughty just as you must be relieved that the House does not expect your purse to be plentiful Over the past seven months you have shown all the necessary qualities of a modern Speaker We could have expected no less from a Speaker who has brought to the Chair of the Chamber one of the longest records of service on the Chairmen s Panel You have been fair but firm you have turned aside confrontation with humour and you have got the better of those unwise enough to challenge your authority Many an awkward moment has been defused with your trademark catch phrase It s no nice Since you came to 
office your voice has become familiar in households across our country You spoke for all your Scottish compatriots in the Chamber when you magnificently brushed aside an impertinent question from the BBC on your accent with the retort I don t have an accent other people have an accent Yours is of course an accent which would normally lend authority to one of the traditional roles of the Speaker the selection of a Scotch whisky for the Speaker s brand For you though as a teetotaller that selection presented some obvious difficulties Your solution delighted the Members whom you invited on to an all party committee to carry out extensive research and tasting on your behalf Not always believing that we get the press that we deserve is of course a frequent experience of Members of this House even if the rest of the country thinks that we get the press that we richly deserve If ever in this Parliament the Lobby dares hint at criticism of our Speaker it can only be because as Chairman of the Administration Committee you banned Lobby correspondents from the Terrace of the Commons unless personally supervised You thereby liberated a grateful House to relax in privacy on the Terrace in these summer months Both you and I are now of that age when we share that puzzling sensation after each election that Members of Parliament seem to be getting younger than we remember To an even younger Chamber than before your commitment that families will be remembered in the proceedings of the House is particularly welcome That commitment comes of course from the affection and importance that you attach to your own family Those who know you well know also that no words of congratulations to you would be complete unless they also expressed appreciation of Mary your wife for her support to you in your role here and in your constituency Mr Speaker Elect on first taking the Chair you said that the Speaker had a clear duty to every side of the House especially to the Back Benchers It is because 
you have served the whole House with impartiality that the whole House has today returned you to the Chair unanimously Your colleagues and your friends congratulate you on it
## 4: On behalf of all my right hon and hon Friends on the Opposition Benches I offer our sincerest congratulations to you Mr Speaker Elect on your re election Many colleagues on both sides of the House will share my relief that it was accompanied by less controversy and took dramatically less time than your election in October Your unanimous re election underlines the important constitutional fact that you are now very much Speaker of the whole of the House of Commons I add the Opposition s congratulations to those expressed to the Father of the House the hon Member for Linlithgow Mr Dalyell who as a highly active Back Bencher and champion of so many causes over decades is in every sense fully qualified for the post It is also a great pleasure to be the first to congratulate the right hon Member for Livingston Mr Cook on his appointment as Leader of the House of Commons We learned from the newspapers that apparently he has been unable to express all his real views as Foreign Secretary over the past four years My hon Friends are much looking forward to asking him for all his real views as Leader of the House during a succession of Question Times and statements over coming months On these occasions Mr Speaker Elect a great deal is usually made of the distinctive parliamentary existence of the holder of your great office that results from the need to cut yourself off from previous party affiliations You have indeed cut yourself off from party affiliations and as has been mentioned served this House impartially It has always been a tradition of Speakers that they do not visit the Tea Room and the bars as part of that procedure You have created a new tradition by being available and visiting the Tea Room and bars but not having a drink there That may keep you happy although it is incomprehensible to the rest of us but we are delighted that you have amended the traditions in that respect On a more serious point you are the custodian of the rules privileges and 
traditions of this House As the many newly elected Members will soon learn we all look to you as the independent champion of all parties in the House and of the rights of all hon Members As you know I have always made a particular point about protecting the rights of the Opposition Front Bench come to think of it though the rights of Back Benchers need protecting too especially those who have not spoken from the Back Benches for a long time I know that you will protect their rights in this Parliament There are also those occasions of high drama Mr Speaker Elect when votes are tied and it falls to you to use your casting vote although I admit that it is not immediately obvious that that will be necessary in a large number of instances You never know we will be working on it from this side of the House I hope that the fact that the Government have such a majority will not deter hon Members on both sides of the House from doing the job that they have been sent here to do to hold the Government to account Like so many past and present Members you and I care passionately about the House of Commons and the standing in which it is held throughout the country That is why I for one deeply regret the diminution of its importance and reputation which has accelerated although it did not begin in recent years I cannot be alone in thinking that that decline contributed in some ways to the disconnection between the public and Parliament that was highlighted by the lamentably low voter turnout in the general election last week Last October I expressed the hope that you would robustly resist all attempts to downgrade marginalise or bypass the House of Commons I repeat that today Few things would give me and I hope hon Members from all parties greater satisfaction in politics than to see this House restored to the centre of our national life The great issues that will come before us during the next few years some of which may be of supreme importance to the way in which our country 
is governed should be fully scrutinised and debated first and foremost in this Chamber On a personal note just as you are about to resume your duties I am planning to relinquish mine as Leader of the Opposition although I shall carry them out for a few weeks more I should like to thank you for the courtesy and kindness that you have shown me behind the scenes since you took the Chair of this House last October I am sure that your advice will continue to be of enormous benefit to hon Members throughout the House Again Mr Speaker Elect I congratulate you and wish you well for what undoubtedly promises to be an important and demanding period in our parliamentary history
## 5: Mr Speaker Elect may I entirely associate my right hon and hon Friends with the personal and unanimous congratulations that the House has recorded on your re election as Speaker this afternoon We wish you all possible success for the forthcoming Parliament You will remember something about which we have joked privately since your first election At that time I recalled our first encounter when I was a schoolboy and you were on a picket line in your then capacity as a NUPE National Union of Public Employees regional official The unanimous endorsement of the House must mean that we have witnessed the ultimate triumph of NUPE man this afternoon We congratulate you on that I should like to take this opportunity also to congratulate the new Father of the House whose courtesy and advice to all of us especially those who have entered the House over the years as new Members has been unstinting and much appreciated It is a great happiness to see him occupy that position on behalf of us all but for me that happiness is tinged with one wee regret I shall dearly miss his distinguished predecessor Sir Edward Heath at Prime Minister s questions I do not have any ambition as Lib Dem leader to write a diary and publish it The book that I want to write and publish one day is the off the record commentary of Sir Edward Heath between 3 o clock and 3 30 every Wednesday afternoon on the parliamentary Conservative party and the questions that its members put to the Prime Minister of the day I shall miss the sensation when Sir Edward was notably upset or distressed by the viewpoints given by one of his own He had a tendency to expel a great sigh of frustration which tended to move me significantly along the Bench I shall miss those noises We must all share a sense of disappointment if not foreboding about the lack of engagement of so many of our fellow citizens in the election That being so I hope that early steps will be taken to examine our procedures and practices in this place 
to ensure that they are as relevant and comprehensible as possible to those outside There is a danger that all of us in all parties will be engulfed if we are not careful Given the balance of the outcome of the election I hope that you Mr Speaker Elect your senior officials and the new Leader of the House will give fresh consideration to the procedures of the House to ensure that they are made more flexible in some ways the House of Lords may provide instruction for us with regard to the rights of all parties especially those in opposition We look forward to contributing constructively to such discussions at the earliest opportunity In the meantime we wish you well Sir
## 6: Mr Speaker Elect on my behalf and that of my colleagues I offer sincere congratulations on your being re elected as Speaker of this place We can vouch for your approachability the care with which you have listened to the issues that Members have brought to you and the way in which you have responded to them We look forward with confidence to that same care and consideration being offered in the months and years to come I congratulate the Father of the House on his conduct of today s proceedings It is not necessary for me to emphasise the way in which he has built a reputation over the years as a Back Bencher who has held Governments of all colours to account That reminds us that that is the prime function that many of us have in the House I am sure Mr Speaker Elect that with your consideration for Back Benchers and for the primary role of the House you will always be ready to assist us in ensuring that the Government are held to account It is commonplace these days to regret the decline of the standing of the House but we would all do well to remember that its standing depends upon each and every one of us in the contribution that we make to the House I am sure that we shall all be conscious of that responsibility However we rely upon you Mr Speaker Elect when it comes to issues where government may perhaps wish to cut corners or to do things elsewhere to insist upon the primacy of the House I shall make a cautionary comment on the remarks of the right hon Member for Ross Skye and Inverness West Mr Kennedy the leader of the Liberal Democrats We want to see the procedures of the House become more effective and clearly and easily understood However virtually every time that the reform of our procedures is broached the result is that the life of government is made easier That is not our objective
Defining the research dataset by filtering on research keywords
# Filtering data containing immigration related words
# Define the keywords to search for
immig_words <- c('immigration', 'immigrant', 'asylum')
# BUG FIX: the pattern is matched against lower-cased text (tolower below),
# so the optional prefix must be lower-case "uk" — the original "(UK)?"
# could never match, silently dropping "ukvisa(s)" mentions.
visa_words <- "\\b(uk)?visas?\\b"
all_words <- paste0(c(paste0(immig_words, collapse = "|"), visa_words), collapse = "|")
# lower case text with keywords: keep speeches whose lower-cased text
# contains any immigration/visa keyword.
tidy_data_notoken <- data %>%
mutate(desc = tolower(text)) %>%
filter(grepl(all_words, desc))
Creating a ‘Year’ column
# Make 'Year' using 'date'. The date columns are already Date class, so
# format() alone suffices — the original re-wrapped them in as.Date().
tidy_data_notoken$Year <- format(tidy_data_notoken$date, "%Y")
data$Year <- format(data$date, "%Y")
# Count the number of text by year and party
text_count <- tidy_data_notoken %>%
group_by(Year, party) %>%
summarise(n = n(), .groups = 'drop')
# Convert Year (already character) to numeric for a continuous plot axis.
text_count$Year <- as.numeric(text_count$Year)
A simple descriptive plot — the number of immigration-related texts by party
# Plotting counts of immigration texts by party
# BUG FIX: theme_minimal() must be chained onto the plot with "+".
# In the original it stood on its own line, so it was evaluated as a
# separate expression — the theme never applied, and its internals were
# printed instead (the long "List of 136" output below).
ggplot(text_count, aes(x = Year, y = n, group = party, color = party)) +
geom_line() +
labs(y = "Immigration texts num", x = "Year") +
scale_x_continuous(breaks = seq(min(text_count$Year), max(text_count$Year), by = 4)) +
theme_minimal()
## List of 136
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ title : NULL
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.75points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.75points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.bottom : NULL
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.75points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.left : NULL
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.75points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.2points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.2points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.bottom : NULL
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.left : NULL
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.theta : NULL
## $ axis.text.r :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.theta : NULL
## $ axis.ticks.r : NULL
## $ axis.minor.ticks.x.top : NULL
## $ axis.minor.ticks.x.bottom : NULL
## $ axis.minor.ticks.y.left : NULL
## $ axis.minor.ticks.y.right : NULL
## $ axis.minor.ticks.theta : NULL
## $ axis.minor.ticks.r : NULL
## $ axis.ticks.length : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom : NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.ticks.length.theta : NULL
## $ axis.ticks.length.r : NULL
## $ axis.minor.ticks.length : 'rel' num 0.75
## $ axis.minor.ticks.length.x : NULL
## $ axis.minor.ticks.length.x.top : NULL
## $ axis.minor.ticks.length.x.bottom: NULL
## $ axis.minor.ticks.length.y : NULL
## $ axis.minor.ticks.length.y.left : NULL
## $ axis.minor.ticks.length.y.right : NULL
## $ axis.minor.ticks.length.theta : NULL
## $ axis.minor.ticks.length.r : NULL
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ axis.line.theta : NULL
## $ axis.line.r : NULL
## $ legend.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.key.spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.key.spacing.x : NULL
## $ legend.key.spacing.y : NULL
## $ legend.frame : NULL
## $ legend.ticks : NULL
## $ legend.ticks.length : 'rel' num 0.2
## $ legend.axis.line : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.position : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.position : NULL
## $ legend.position : chr "right"
## $ legend.position.inside : NULL
## $ legend.direction : NULL
## $ legend.byrow : NULL
## $ legend.justification : chr "center"
## $ legend.justification.top : NULL
## $ legend.justification.bottom : NULL
## $ legend.justification.left : NULL
## $ legend.justification.right : NULL
## $ legend.justification.inside : NULL
## $ legend.location : NULL
## $ legend.box : NULL
## $ legend.box.just : NULL
## $ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
## ..- attr(*, "unit")= int 1
## $ legend.box.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## [list output truncated]
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
Dividing the number of immigration texts by the total number of texts
# Count the total number of texts in data by year and party
data_count <- data %>%
group_by(Year, party) %>%
summarise(total_n = n(), .groups = 'drop')
# Merge the counts and calculate the ratio
# NOTE(review): `text_count` (immigration-text counts per Year/party) is built
# earlier in the document; merge() keeps only Year/party pairs present in both.
ratio_count <- merge(text_count, data_count, by = c("Year", "party"))
# ratio = immigration-related texts / all texts for that Year/party
ratio_count$ratio <- with(ratio_count, n / total_n)
# Only counts more than 100 texts
# (n is the immigration-text count inherited from text_count)
filtered_ratio_count <- ratio_count %>%
filter(n > 100)
# Year came through grouping as character/factor; coerce to numeric so the
# plot below can use a continuous x-axis
filtered_ratio_count$Year <- as.numeric(as.character(filtered_ratio_count$Year))
A ratio plot of a number of immigration related texts by total texts
# Plot the share of immigration-related texts per party over time.
# One line per party; y-axis shown as percentages starting at zero.
ratio_plot <- ggplot(
filtered_ratio_count,
aes(x = Year, y = ratio, group = party, color = party)
) +
geom_line() +
scale_x_continuous(breaks = seq(min(filtered_ratio_count$Year), max(filtered_ratio_count$Year), by = 4)) +
scale_y_continuous(labels = scales::percent_format(), expand = c(0, 0), limits = c(0, NA)) +
labs(y = "% Immigration Texts (n>100)", x = "Year") +
theme_minimal()
ratio_plot
Tokenisation and processing stop words
# Tokenisation & removing stop words
tidy_data <- tidy_data_notoken %>%
unnest_tokens(word, desc) %>%
# keep tokens containing at least one letter (drops pure numbers/punctuation)
filter(str_detect(word, "[a-z]")) %>%
filter(!word %in% stop_words$word)
# Order tokens chronologically, then record each token's position
tidy_data <- tidy_data %>%
arrange(date)
tidy_data$order <- seq_len(nrow(tidy_data)) # Make orders of each word; seq_len() is safe for zero rows, unlike 1:nrow()
# Common tokens
word_count <- tidy_data %>%
count(word, sort = TRUE)
show(word_count)
## word n
## <char> <int>
## 1: people 27644
## 2: hon 26615
## 3: government 26551
## 4: immigration 19105
## 5: minister 14229
## ---
## 39501: énarques 1
## 39502: ørsted 1
## 39503: þæt 1
## 39504: šefcovic 1
## 39505: štefan 1
# Common tokens by year (counts are computed within each Year group)
word_count_year <- tidy_data %>%
group_by(Year) %>%
count(word, sort = TRUE)
show(word_count_year)
## # A tibble: 226,043 × 3
## # Groups: Year [19]
## Year word n
## <chr> <chr> <int>
## 1 2018 government 2122
## 2 2018 people 2060
## 3 2015 people 2058
## 4 2018 immigration 2046
## 5 2004 government 2007
## 6 2002 people 1883
## 7 2018 hon 1851
## 8 2004 hon 1826
## 9 2002 hon 1792
## 10 2007 hon 1792
## # ℹ 226,033 more rows
# Common tokens by party (counts are computed within each party group)
word_count_party <- tidy_data %>%
group_by(party) %>%
count(word, sort = TRUE)
show(word_count_party)
## # A tibble: 98,986 × 3
## # Groups: party [9]
## party word n
## <chr> <chr> <int>
## 1 Labour people 12163
## 2 Conservative government 12117
## 3 Labour hon 12010
## 4 Conservative hon 11848
## 5 Conservative people 11229
## 6 Labour government 9743
## 7 Conservative immigration 9001
## 8 Labour immigration 7336
## 9 Conservative minister 6290
## 10 Conservative country 6258
## # ℹ 98,976 more rows
Calculating NRC sentiment negative ratios of texts which contain immigration related words by party
# Arranging data by party and calculate the ratio of negative sentiment.
# NRC maps one word to several sentiments, so the join fans out on purpose;
# declare the relationship explicitly instead of relying on the warning.
nrc_data_party <- tidy_data %>%
inner_join(get_sentiments("nrc"), by = "word",
relationship = "many-to-many") %>%
group_by(party, date) %>%
count(sentiment) %>%
spread(key = sentiment, value = n, fill = 0) %>%
mutate(ratio = negative/(positive+1)) %>%  # +1 guards against division by zero
ungroup()
## Warning in inner_join(., get_sentiments("nrc"), by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6 of `x` matches multiple rows in `y`.
## ℹ Row 5657 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Keep only parties with more than 100 daily sentiment observations
filtered_nrc_data_party <-nrc_data_party %>%
group_by(party) %>%
filter(n() > 100) %>% # n() counts rows (party-day observations), not words
ungroup()
Plotting the negative sentiment ratio for each party. Points indicate the actual scores, while smoothed lines show the trends.
# A negative sentiment ratio plot of four parties.
# `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
filtered_nrc_data_party %>%
ggplot(aes(date, ratio, color = party)) +
geom_point(alpha=0.5, size = 1) +
geom_smooth(method="loess", se = FALSE, alpha=0.7, linewidth = 1) +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
labs(y = "NRC Sentiment Negative Ratio by Party", x = "Date") +
geom_vline(xintercept = as.numeric(as.Date("2007-09-14")), col="red", linewidth = 1) +
annotate("text", x = as.Date("2007-09-24"), y = 4, label="Northern Rock\nLiquidity Support", angle=90, color = "black", size = 4) +
theme_minimal() +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
Make the plot above readable, dividing plots by party
# Faceting by party (free y-scales, so panels are not directly comparable).
# `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
filtered_nrc_data_party %>%
ggplot(aes(x = date, y = ratio, color = party)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "loess", se = FALSE, alpha = 0.7, linewidth = 1) +
labs(y = "NRC Sentiment Negative Ratio", x = "Date") +
facet_wrap(~party, scales = "free_y") +
geom_vline(xintercept = as.numeric(as.Date("2007-09-14")), col = "red", linewidth = 1) +
theme_minimal() +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
What is the most common sentiment? Summing up scores of sentiments by party to compare.
# Total sentiment word counts by party.
# Passing `na.rm` through `...` in across() is deprecated (dplyr >= 1.1.0);
# supply it via an anonymous function instead.
dominant_senti_nrc_party <- filtered_nrc_data_party %>%
group_by(party) %>%
summarise(across(c(anger, anticipation, disgust, fear, joy, sadness, surprise, trust, negative, positive), \(x) sum(x, na.rm = TRUE)))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## ℹ In group 1: `party = "Conservative"`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
show(dominant_senti_nrc_party)
## # A tibble: 4 × 11
## party anger anticipation disgust fear joy sadness surprise trust negative
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Conser… 33439 52127 17861 56500 36474 32322 17703 99258 84545
## 2 Labour 35105 53846 18420 57405 38456 34318 18408 97785 85101
## 3 Libera… 5625 9303 2963 9620 5875 5778 3149 15899 14675
## 4 Scotti… 4637 7398 2523 8180 4986 5388 2638 12649 12877
## # ℹ 1 more variable: positive <dbl>
Beyond negative/positive ratio, focusing on specific sentiments trends by party.
# Combining all sentiments into one column (long format for faceted plotting);
# negative/positive and ratio stay as id columns on each row
long_nrc_data <- filtered_nrc_data_party %>%
pivot_longer(cols = c(anger, fear, trust, sadness, disgust, anticipation, surprise, joy), names_to = "sentiment", values_to = "score")
Faceting plots to compare the sentiment changes of each party
# Plotting specific sentiment changes by party.
# Fixes: duplicated theme_minimal() call removed; deprecated `size` on the
# smooth line replaced by `linewidth` (ggplot2 >= 3.4.0).
long_nrc_data %>%
ggplot(aes(x = date, y = score, color = sentiment)) +
geom_smooth(method = "loess", se = FALSE, alpha = 1, linewidth = 1) +
geom_point(alpha = 0.5, size = 0.5) +
facet_wrap(~ party, scales = "free_y") +
labs(x = "Date", y = "Score") +
geom_vline(xintercept = as.numeric(as.Date("2007-09-14")), col = "red") +
theme_minimal() +
theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
To make sentiment points recognizable, adjusting the shapes of points by sentiment and plotting
# Shaping sentiment points differently.
# Fix: `shape_mapping` was defined but never applied, so ggplot fell back to
# the default 6-shape palette (warning + 8132 dropped points). Registering it
# with scale_shape_manual() gives all 8 sentiments a distinct shape.
shape_mapping <- c("anger" = 17, "disgust" = 18, "fear" = 19, "sadness" = 8, "trust" = 21, "anticipation" = 22, "surprise" = 23, "joy" = 24)
long_nrc_data %>%
ggplot(aes(x = date, y = score, color = sentiment)) +
geom_smooth(method = "loess", se = FALSE, alpha = 0.75, linewidth = 0.75) +
geom_point(aes(shape = sentiment), alpha = 0.5, size = 0.5) +
scale_shape_manual(values = shape_mapping) +
facet_wrap(~ party, scales = "free_y", nrow = 2) +
labs(x = "Date", y = "Score") +
geom_vline(xintercept = as.numeric(as.Date("2007-09-14")), col = "red") +
theme_minimal() +
theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 8132 rows containing missing values or values outside the scale range
## (`geom_point()`).
Selecting negative sentiments to confirm sentiment changes
# Specifying negative feelings only
long_nrc_data_neg <- filtered_nrc_data_party %>%
pivot_longer(cols = c(anger, fear, sadness, disgust), names_to = "sentiment_negative", values_to = "score")
# Plots of specific negative sentiments by party.
# `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
long_nrc_data_neg %>%
ggplot(aes(x = date, y = score, color = sentiment_negative)) +
geom_smooth(method = "loess", se = FALSE, alpha = 0.75, linewidth = 1) +
geom_point(aes(shape = sentiment_negative), alpha = 0.5, size = 1) +
facet_wrap(~ party, scales = "free_y", nrow = 2) +
labs(x = "Date", y = "Score") +
geom_vline(xintercept = as.numeric(as.Date("2007-09-14")), col = "red") +
theme_minimal() +
theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
To compare against the sentiment trends of immigration-related texts, draw a random sample of 10,000 texts and run the same sentiment analysis
# Random sampling
# NOTE(review): no set.seed() immediately before sampling, so the sample is not
# reproducible across knits unless an earlier seed still applies — confirm intended.
data_sample <- data %>%
sample_n(10000)
# tidy sample: lowercase, tokenise, keep alphabetic tokens, drop stop words
tidy_samp <- data_sample %>%
mutate(desc = tolower(text)) %>%
unnest_tokens(word, desc) %>%
filter(str_detect(word, "[a-z]")) %>%
filter(!word %in% stop_words$word) %>%
arrange(date)
tidy_samp$order <- seq_len(nrow(tidy_samp))  # seq_len() is safe for zero rows, unlike 1:nrow()
# Applying NRC dictionary and calculating total negative ratio.
# NRC maps one word to several sentiments, so the join fans out on purpose;
# declare the relationship explicitly instead of relying on the warning.
samp_nrc_data <- tidy_samp %>%
inner_join(get_sentiments("nrc"), by = "word",
relationship = "many-to-many") %>%
count(date, index = order %/% 1000, sentiment) %>%  # bucket tokens in blocks of 1000
spread(key = sentiment, value = n, fill = 0) %>%
mutate(ratio = negative / (positive+1))  # +1 guards against division by zero
## Warning in inner_join(., get_sentiments("nrc"), by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 5541 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Applying NRC dictionary and calculating negative ratio by party.
# NRC maps one word to several sentiments, so the join fans out on purpose;
# declare the relationship explicitly instead of relying on the warning.
samp_nrc_party <- tidy_samp %>%
inner_join(get_sentiments("nrc"), by = "word",
relationship = "many-to-many") %>%
group_by(party, date) %>%
count(sentiment) %>%
spread(key = sentiment, value = n, fill = 0) %>%
mutate(ratio = negative / (positive+1)) %>%  # +1 guards against division by zero
ungroup()
## Warning in inner_join(., get_sentiments("nrc"), by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 5541 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Keep only parties with more than 100 party-day observations in the sample
filtered_samp_nrc_party <- samp_nrc_party %>%
group_by(party) %>%
filter(n() > 100) %>%
ungroup()
A plot of sample text negative sentiment ratio by party
# Sample negative-ratio plot by party.
# Fix: `theme_minimal() + theme(...)` was detached from the `+` chain (no
# trailing `+` after annotate()), so the theme never applied and the bare theme
# object was printed instead — reattach it to the plot.
filtered_samp_nrc_party %>%
ggplot(aes(date, ratio, color = party)) +
geom_point(alpha=0.25) +
geom_smooth(method="loess", se = FALSE, alpha=0.25, linewidth = 1.5) +
labs(y = "Sample NRC sentiment negative ratio by party", x = "Date") +
geom_vline(xintercept = as.numeric(as.Date("2007-09-14")), col = "red", linewidth = 1) +
annotate("text", x = as.Date("2007-09-24"), y = 5, label="Northern Rock\nLiquidity Support", angle=90, color = "black", size = 4) +
theme_minimal() +
theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
## List of 136
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ title : NULL
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.75points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.75points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.bottom : NULL
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.75points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.left : NULL
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.75points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.2points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.2points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.bottom : NULL
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.left : NULL
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.theta : NULL
## $ axis.text.r :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.theta : NULL
## $ axis.ticks.r : NULL
## $ axis.minor.ticks.x.top : NULL
## $ axis.minor.ticks.x.bottom : NULL
## $ axis.minor.ticks.y.left : NULL
## $ axis.minor.ticks.y.right : NULL
## $ axis.minor.ticks.theta : NULL
## $ axis.minor.ticks.r : NULL
## $ axis.ticks.length : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom : NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.ticks.length.theta : NULL
## $ axis.ticks.length.r : NULL
## $ axis.minor.ticks.length : 'rel' num 0.75
## $ axis.minor.ticks.length.x : NULL
## $ axis.minor.ticks.length.x.top : NULL
## $ axis.minor.ticks.length.x.bottom: NULL
## $ axis.minor.ticks.length.y : NULL
## $ axis.minor.ticks.length.y.left : NULL
## $ axis.minor.ticks.length.y.right : NULL
## $ axis.minor.ticks.length.theta : NULL
## $ axis.minor.ticks.length.r : NULL
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ axis.line.theta : NULL
## $ axis.line.r : NULL
## $ legend.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.key.spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.key.spacing.x : NULL
## $ legend.key.spacing.y : NULL
## $ legend.frame : NULL
## $ legend.ticks : NULL
## $ legend.ticks.length : 'rel' num 0.2
## $ legend.axis.line : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.position : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.position : NULL
## $ legend.position : chr "bottom"
## $ legend.position.inside : NULL
## $ legend.direction : NULL
## $ legend.byrow : NULL
## $ legend.justification : chr "center"
## $ legend.justification.top : NULL
## $ legend.justification.bottom : NULL
## $ legend.justification.left : NULL
## $ legend.justification.right : NULL
## $ legend.justification.inside : NULL
## $ legend.location : NULL
## $ legend.box : NULL
## $ legend.box.just : NULL
## $ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
## ..- attr(*, "unit")= int 1
## $ legend.box.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## [list output truncated]
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
Dividing the sample sentiment plot by party
# Faceting plots by party
filtered_samp_nrc_party %>%
ggplot(aes(date, ratio, color = party)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", se = F, alpha = 0.7, size = 1) +
labs(y = "Sample NRC Sentiment Negative Ratio", x = "Date") +
facet_wrap(~party, scales = "free_y") +
geom_vline(xintercept = as.numeric(as.Date("2007-09-14")), col = "red", size = 1) +
theme_minimal() +
theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
# Set up
library(text2vec)
library(stringr)
library(umap)
library(ggrepel)
To make four different GloVe embedding models, dividing data by party and date
library(RcppParallel)
# filter data by party and date. Now have pre/post economic crisis text data of Conservative and Labour party
# NOTE(review): the split date 2007-09-24 differs from the 2007-09-14 Northern
# Rock date marked on the plots above — confirm which cut-off is intended.
data_conser_pre <- filter(tidy_data_notoken, party == "Conservative" & date < as.Date("2007-09-24"))
data_labour_pre <- filter(tidy_data_notoken, party == "Labour" & date < as.Date("2007-09-24"))
data_conser_post <- filter(tidy_data_notoken, party == "Conservative" & date >= as.Date("2007-09-24"))
data_labour_post <- filter(tidy_data_notoken, party == "Labour" & date >= as.Date("2007-09-24"))
Training process
set.seed(42L)
# Shuffle the pre-crisis Conservative texts before tokenising
# (sample() with no size argument returns a random permutation of the input)
glove_text_conser_pre <- sample(data_conser_pre$desc)
# Whitespace tokenisation -> iterator -> vocabulary for text2vec
tokens_conser_pre <- space_tokenizer(glove_text_conser_pre)
it_conser_pre <- itoken(tokens_conser_pre, progressbar = FALSE)
vocab_conser_pre <- create_vocabulary(it_conser_pre)
# NOTE(review): COUNT_MIN, WINDOW_SIZE, DIM and ITERS are defined elsewhere in
# the document — confirm the same values are used for all four party/period models.
vocab_pruned_conser_pre <- prune_vocabulary(vocab_conser_pre, term_count_min = COUNT_MIN)
vectorizer_conser_pre <- vocab_vectorizer(vocab_pruned_conser_pre)
# Term co-occurrence matrix with a symmetric window and uniform weights
tcm_conser_pre <- create_tcm(it_conser_pre, vectorizer_conser_pre, skip_grams_window = WINDOW_SIZE, skip_grams_window_context = "symmetric", weights = rep(1, WINDOW_SIZE))
glove_conser_pre <- GlobalVectors$new(rank = DIM, x_max = 100, learning_rate = 0.05)
# Fit GloVe on all available threads; returns the "main" word-vector matrix
word_vectors_main_conser_pre <- glove_conser_pre$fit_transform(tcm_conser_pre, n_iter = ITERS, convergence_tol = 0.001, n_threads = RcppParallel::defaultNumThreads())
## INFO [22:54:28.379] epoch 1, loss 0.1746
## INFO [22:54:30.812] epoch 2, loss 0.0753
## INFO [22:54:33.163] epoch 3, loss 0.0537
## INFO [22:54:35.501] epoch 4, loss 0.0430
## INFO [22:54:37.861] epoch 5, loss 0.0363
## INFO [22:54:40.215] epoch 6, loss 0.0317
## INFO [22:54:42.562] epoch 7, loss 0.0282
## INFO [22:54:44.905] epoch 8, loss 0.0255
## INFO [22:54:47.286] epoch 9, loss 0.0234
## INFO [22:54:49.654] epoch 10, loss 0.0216
## INFO [22:54:51.993] epoch 11, loss 0.0202
## INFO [22:54:54.360] epoch 12, loss 0.0190
## INFO [22:54:56.711] epoch 13, loss 0.0179
## INFO [22:54:59.067] epoch 14, loss 0.0170
## INFO [22:55:01.456] epoch 15, loss 0.0162
## INFO [22:55:03.816] epoch 16, loss 0.0154
## INFO [22:55:06.183] epoch 17, loss 0.0148
## INFO [22:55:08.577] epoch 18, loss 0.0142
## INFO [22:55:10.939] epoch 19, loss 0.0137
## INFO [22:55:13.310] epoch 20, loss 0.0132
## INFO [22:55:15.694] epoch 21, loss 0.0128
## INFO [22:55:18.055] epoch 22, loss 0.0124
## INFO [22:55:20.467] epoch 23, loss 0.0120
## INFO [22:55:22.882] epoch 24, loss 0.0117
## INFO [22:55:25.234] epoch 25, loss 0.0113
## INFO [22:55:27.613] epoch 26, loss 0.0110
## INFO [22:55:29.952] epoch 27, loss 0.0108
## INFO [22:55:32.294] epoch 28, loss 0.0105
## INFO [22:55:34.654] epoch 29, loss 0.0102
## INFO [22:55:37.036] epoch 30, loss 0.0100
## INFO [22:55:39.400] epoch 31, loss 0.0098
## INFO [22:55:41.765] epoch 32, loss 0.0096
## INFO [22:55:44.088] epoch 33, loss 0.0094
## INFO [22:55:46.442] epoch 34, loss 0.0092
## INFO [22:55:48.824] epoch 35, loss 0.0090
## INFO [22:55:51.172] epoch 36, loss 0.0088
## INFO [22:55:53.536] epoch 37, loss 0.0087
## INFO [22:55:55.920] epoch 38, loss 0.0085
## INFO [22:55:58.271] epoch 39, loss 0.0083
## INFO [22:56:00.648] epoch 40, loss 0.0082
## INFO [22:56:03.007] epoch 41, loss 0.0080
## INFO [22:56:05.381] epoch 42, loss 0.0079
## INFO [22:56:07.881] epoch 43, loss 0.0078
## INFO [22:56:10.254] epoch 44, loss 0.0076
## INFO [22:56:12.609] epoch 45, loss 0.0075
## INFO [22:56:14.987] epoch 46, loss 0.0074
## INFO [22:56:17.374] epoch 47, loss 0.0073
## INFO [22:56:19.715] epoch 48, loss 0.0072
## INFO [22:56:22.058] epoch 49, loss 0.0071
## INFO [22:56:24.430] epoch 50, loss 0.0070
## INFO [22:56:26.801] epoch 51, loss 0.0069
## INFO [22:56:29.172] epoch 52, loss 0.0068
## INFO [22:56:31.499] epoch 53, loss 0.0067
## INFO [22:56:33.857] epoch 54, loss 0.0066
## INFO [22:56:36.201] epoch 55, loss 0.0065
## INFO [22:56:38.546] epoch 56, loss 0.0064
## INFO [22:56:40.887] epoch 57, loss 0.0063
## INFO [22:56:43.388] epoch 58, loss 0.0062
## INFO [22:56:45.789] epoch 59, loss 0.0061
## INFO [22:56:48.146] epoch 60, loss 0.0061
## INFO [22:56:50.484] epoch 61, loss 0.0060
## INFO [22:56:52.837] epoch 62, loss 0.0059
## INFO [22:56:55.176] epoch 63, loss 0.0058
## INFO [22:56:57.544] epoch 64, loss 0.0058
## INFO [22:56:59.880] epoch 65, loss 0.0057
## INFO [22:57:02.276] epoch 66, loss 0.0056
## INFO [22:57:04.638] epoch 67, loss 0.0056
## INFO [22:57:07.023] epoch 68, loss 0.0055
## INFO [22:57:09.357] epoch 69, loss 0.0054
## INFO [22:57:11.750] epoch 70, loss 0.0054
## INFO [22:57:14.111] epoch 71, loss 0.0053
## INFO [22:57:16.500] epoch 72, loss 0.0052
## INFO [22:57:18.860] epoch 73, loss 0.0052
## INFO [22:57:21.250] epoch 74, loss 0.0051
## INFO [22:57:23.636] epoch 75, loss 0.0051
## INFO [22:57:26.008] epoch 76, loss 0.0050
## INFO [22:57:28.379] epoch 77, loss 0.0049
## INFO [22:57:30.720] epoch 78, loss 0.0049
## INFO [22:57:33.072] epoch 79, loss 0.0048
## INFO [22:57:35.442] epoch 80, loss 0.0048
## INFO [22:57:37.809] epoch 81, loss 0.0047
## INFO [22:57:40.171] epoch 82, loss 0.0047
## INFO [22:57:42.536] epoch 83, loss 0.0046
## INFO [22:57:44.876] epoch 84, loss 0.0046
## INFO [22:57:47.235] epoch 85, loss 0.0045
## INFO [22:57:49.565] epoch 86, loss 0.0045
## INFO [22:57:51.900] epoch 87, loss 0.0045
## INFO [22:57:54.290] epoch 88, loss 0.0044
## INFO [22:57:56.787] epoch 89, loss 0.0044
## INFO [22:57:59.242] epoch 90, loss 0.0043
## INFO [22:58:01.651] epoch 91, loss 0.0043
## INFO [22:58:04.018] epoch 92, loss 0.0042
## INFO [22:58:06.374] epoch 93, loss 0.0042
## INFO [22:58:08.708] epoch 94, loss 0.0042
## INFO [22:58:11.103] epoch 95, loss 0.0041
## INFO [22:58:13.470] epoch 96, loss 0.0041
## INFO [22:58:15.841] epoch 97, loss 0.0040
## INFO [22:58:18.210] epoch 98, loss 0.0040
## INFO [22:58:20.542] epoch 99, loss 0.0040
## INFO [22:58:22.870] epoch 100, loss 0.0039
# GloVe fits two vector sets (main + context); their sum is used here as the
# final embedding matrix (rows = words, columns = dimensions)
word_vectors_context_conser_pre <- glove_conser_pre$components
glove_embedding_conser_pre <- word_vectors_main_conser_pre + t(word_vectors_context_conser_pre)
# Cache the trained embedding so later knits can load it instead of retraining
saveRDS(glove_embedding_conser_pre, file = "local_glove_conser_pre.rds")
Modelling takes time, so for knitting I will load a prepared model that was created by the same process.
# Load the pre-trained embedding from GitHub instead of retraining at knit time.
# Fix: the result was previously assigned to the misspelled
# `golve_embedding_conser_pre`, so the downloaded model was never actually
# used by the downstream UMAP/similarity code.
url_conser_pre <- "https://github.com/RiverKim-garam/CTA24-Final-assessment/blob/main/local_glove_conser_pre.rds?raw=true"
glove_embedding_conser_pre <- readRDS(url(url_conser_pre, method = "libcurl"))
Total GloVe word embedding two dimensional umap
# Plotting the whole word embeddings of pre-crisis Conservative party immigration related text
# Project the embedding to 2-D with UMAP (cosine metric to match the GloVe space);
# NOTE(review): umap is stochastic — reproducibility relies on the earlier set.seed(42L).
umap_conser_pre <- umap(glove_embedding_conser_pre, n_components = 2, metric = "cosine", n_neighbors = 25, min_dist = 0.1, spread = 2)
df_umap_conser_pre <- as.data.frame(umap_conser_pre[["layout"]])
# Row names of the layout are the vocabulary words; keep them as a column
df_umap_conser_pre$word <- rownames(df_umap_conser_pre)
colnames(df_umap_conser_pre) <- c("Pre_Cons1", "Pre_Cons2", "word")
ggplot(df_umap_conser_pre) +
geom_point(aes(x = Pre_Cons1, y = Pre_Cons2), color = 'blue', size = 0.05) +
labs(title = "Conservative Party (Pre-Crisis): Word Embeddings of GloVe and UMAP") +
theme_minimal()
Pre-crisis Conservative party word embedding of words related to immigration
# Plot the word embedding of words that are related for the GloVe model (Case1: immigration)
word_conser_pre_1 <- glove_embedding_conser_pre["immigration",, drop = FALSE]
# Cosine similarity of every vocabulary word against "immigration" (use <-, not =)
cos_sim <- sim2(x = glove_embedding_conser_pre, y = word_conser_pre_1, method = "cosine", norm = "l2")
# Top 25 most similar words; names() replaces the roundabout
# rownames(as.data.frame(...)) construction, and `top_words` avoids
# shadowing dplyr::select with a data frame called `select`
top_words <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_pre_1 <- df_umap_conser_pre %>%
inner_join(y = top_words, by = "word")
#The ggplot visual for GloVe
ggplot(selected_words_conser_pre_1, aes(x = Pre_Cons1, y = Pre_Cons2)) +
geom_point(show.legend = FALSE) +
geom_text(aes(Pre_Cons1, Pre_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust=-1.5, hjust=0) +
labs(title = "Conservative party (Pre-Crisis) word embedding of 'immigration'") +
theme(plot.title = element_text(hjust = .5, size = 14))
Pre-crisis Conservative party word embedding of words related to immigrants
# Case 2 ("immigrants"): find the 25 nearest words by cosine similarity in
# the pre-crisis Conservative embedding and label them on the UMAP plane.
word_conser_pre_2 <- glove_embedding_conser_pre["immigrants", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_conser_pre, y = word_conser_pre_2, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_pre_2 <- inner_join(df_umap_conser_pre, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_conser_pre_2, aes(x = Pre_Cons1, y = Pre_Cons2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Pre_Cons1, Pre_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Conservative party (Pre-Crisis) word embedding of 'immigrants'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Pre-crisis Conservative party word embedding of words related to asylums
# Case 3 ("asylums"): find the 25 nearest words by cosine similarity in
# the pre-crisis Conservative embedding and label them on the UMAP plane.
word_conser_pre_3 <- glove_embedding_conser_pre["asylums", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_conser_pre, y = word_conser_pre_3, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_pre_3 <- inner_join(df_umap_conser_pre, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_conser_pre_3, aes(x = Pre_Cons1, y = Pre_Cons2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Pre_Cons1, Pre_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Conservative party (Pre-Crisis) word embedding of 'asylums'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Pre-crisis Conservative party word embedding of words related to asylum
# Case 4 ("asylum"): find the 25 nearest words by cosine similarity in
# the pre-crisis Conservative embedding and label them on the UMAP plane.
word_conser_pre_4 <- glove_embedding_conser_pre["asylum", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_conser_pre, y = word_conser_pre_4, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_pre_4 <- inner_join(df_umap_conser_pre, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_conser_pre_4, aes(x = Pre_Cons1, y = Pre_Cons2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Pre_Cons1, Pre_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Conservative party (Pre-Crisis) word embedding of 'asylum'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Training process
# Fit a GloVe model on the post-crisis Conservative immigration speeches.
# Relies on globals set earlier in the file: COUNT_MIN, WINDOW_SIZE, DIM, ITERS.
set.seed(42L)  # reproducible shuffle of the input documents
shuffled_conser_post <- sample(data_conser_post$desc)
it_conser_post <- itoken(space_tokenizer(shuffled_conser_post), progressbar = FALSE)
# Build and prune the vocabulary, then the term co-occurrence matrix
pruned_conser_post <- prune_vocabulary(create_vocabulary(it_conser_post), term_count_min = COUNT_MIN)
vectorizer_conser_post <- vocab_vectorizer(pruned_conser_post)
tcm_conser_post <- create_tcm(it_conser_post, vectorizer_conser_post, skip_grams_window = WINDOW_SIZE,
                              skip_grams_window_context = "symmetric", weights = rep(1, WINDOW_SIZE))
# glove_conser_post and word_vectors_main_conser_post are reused further down
glove_conser_post <- GlobalVectors$new(rank = DIM, x_max = 100, learning_rate = 0.05)
word_vectors_main_conser_post <- glove_conser_post$fit_transform(tcm_conser_post, n_iter = ITERS,
                                                                 convergence_tol = 0.001,
                                                                 n_threads = RcppParallel::defaultNumThreads())
## INFO [22:59:14.384] epoch 1, loss 0.1729
## INFO [22:59:19.427] epoch 2, loss 0.0745
## INFO [22:59:24.462] epoch 3, loss 0.0544
## INFO [22:59:29.546] epoch 4, loss 0.0443
## INFO [22:59:34.586] epoch 5, loss 0.0380
## INFO [22:59:39.621] epoch 6, loss 0.0336
## INFO [22:59:44.914] epoch 7, loss 0.0303
## INFO [22:59:49.990] epoch 8, loss 0.0278
## INFO [22:59:55.068] epoch 9, loss 0.0258
## INFO [23:00:00.158] epoch 10, loss 0.0241
## INFO [23:00:05.237] epoch 11, loss 0.0227
## INFO [23:00:10.347] epoch 12, loss 0.0215
## INFO [23:00:15.405] epoch 13, loss 0.0205
## INFO [23:00:20.470] epoch 14, loss 0.0196
## INFO [23:00:25.518] epoch 15, loss 0.0188
## INFO [23:00:30.588] epoch 16, loss 0.0181
## INFO [23:00:35.664] epoch 17, loss 0.0175
## INFO [23:00:40.749] epoch 18, loss 0.0169
## INFO [23:00:45.837] epoch 19, loss 0.0163
## INFO [23:00:50.971] epoch 20, loss 0.0159
## INFO [23:00:56.068] epoch 21, loss 0.0154
## INFO [23:01:01.207] epoch 22, loss 0.0150
## INFO [23:01:06.641] epoch 23, loss 0.0146
## INFO [23:01:11.814] epoch 24, loss 0.0143
## INFO [23:01:16.931] epoch 25, loss 0.0139
## INFO [23:01:21.994] epoch 26, loss 0.0136
## INFO [23:01:27.079] epoch 27, loss 0.0133
## INFO [23:01:32.238] epoch 28, loss 0.0131
## INFO [23:01:37.365] epoch 29, loss 0.0128
## INFO [23:01:42.457] epoch 30, loss 0.0126
## INFO [23:01:47.531] epoch 31, loss 0.0123
## INFO [23:01:52.587] epoch 32, loss 0.0121
## INFO [23:01:57.714] epoch 33, loss 0.0119
## INFO [23:02:02.887] epoch 34, loss 0.0117
## INFO [23:02:08.100] epoch 35, loss 0.0115
## INFO [23:02:13.977] epoch 36, loss 0.0113
## INFO [23:02:20.129] epoch 37, loss 0.0111
## INFO [23:02:25.929] epoch 38, loss 0.0110
## INFO [23:02:31.444] epoch 39, loss 0.0108
## INFO [23:02:37.064] epoch 40, loss 0.0106
## INFO [23:02:42.346] epoch 41, loss 0.0105
## INFO [23:02:47.690] epoch 42, loss 0.0103
## INFO [23:02:52.978] epoch 43, loss 0.0102
## INFO [23:02:58.300] epoch 44, loss 0.0101
## INFO [23:03:03.627] epoch 45, loss 0.0099
## INFO [23:03:08.920] epoch 46, loss 0.0098
## INFO [23:03:14.154] epoch 47, loss 0.0097
## INFO [23:03:19.520] epoch 48, loss 0.0095
## INFO [23:03:24.786] epoch 49, loss 0.0094
## INFO [23:03:30.117] epoch 50, loss 0.0093
## INFO [23:03:35.410] epoch 51, loss 0.0092
## INFO [23:03:40.730] epoch 52, loss 0.0091
## INFO [23:03:45.980] epoch 53, loss 0.0090
## INFO [23:03:51.276] epoch 54, loss 0.0089
## INFO [23:03:56.615] epoch 55, loss 0.0088
## INFO [23:04:01.897] epoch 56, loss 0.0087
## INFO [23:04:07.110] epoch 57, loss 0.0086
## INFO [23:04:12.381] epoch 58, loss 0.0085
## INFO [23:04:17.683] epoch 59, loss 0.0084
## INFO [23:04:22.936] epoch 60, loss 0.0083
## INFO [23:04:28.223] epoch 61, loss 0.0082
## INFO [23:04:33.489] epoch 62, loss 0.0082
## INFO [23:04:38.798] epoch 63, loss 0.0081
## INFO [23:04:44.045] epoch 64, loss 0.0080
## INFO [23:04:49.343] epoch 65, loss 0.0079
## INFO [23:04:54.678] epoch 66, loss 0.0079
## INFO [23:04:59.944] epoch 67, loss 0.0078
## INFO [23:05:05.236] epoch 68, loss 0.0077
## INFO [23:05:10.539] epoch 69, loss 0.0076
## INFO [23:05:15.894] epoch 70, loss 0.0076
## INFO [23:05:21.239] epoch 71, loss 0.0075
## INFO [23:05:26.531] epoch 72, loss 0.0074
## INFO [23:05:31.797] epoch 73, loss 0.0074
## INFO [23:05:37.070] epoch 74, loss 0.0073
## INFO [23:05:42.437] epoch 75, loss 0.0072
## INFO [23:05:47.762] epoch 76, loss 0.0072
## INFO [23:05:53.132] epoch 77, loss 0.0071
## INFO [23:05:58.380] epoch 78, loss 0.0071
## INFO [23:06:03.864] epoch 79, loss 0.0070
## INFO [23:06:09.148] epoch 80, loss 0.0069
## INFO [23:06:14.413] epoch 81, loss 0.0069
## INFO [23:06:19.666] epoch 82, loss 0.0068
## INFO [23:06:24.920] epoch 83, loss 0.0068
## INFO [23:06:30.203] epoch 84, loss 0.0067
## INFO [23:06:35.503] epoch 85, loss 0.0067
## INFO [23:06:40.847] epoch 86, loss 0.0066
## INFO [23:06:46.104] epoch 87, loss 0.0066
## INFO [23:06:51.417] epoch 88, loss 0.0065
## INFO [23:06:56.717] epoch 89, loss 0.0065
## INFO [23:07:02.023] epoch 90, loss 0.0064
## INFO [23:07:07.294] epoch 91, loss 0.0064
## INFO [23:07:12.598] epoch 92, loss 0.0063
## INFO [23:07:17.858] epoch 93, loss 0.0063
## INFO [23:07:23.141] epoch 94, loss 0.0062
## INFO [23:07:28.383] epoch 95, loss 0.0062
## INFO [23:07:33.721] epoch 96, loss 0.0062
## INFO [23:07:39.031] epoch 97, loss 0.0061
## INFO [23:07:44.354] epoch 98, loss 0.0061
## INFO [23:07:49.639] epoch 99, loss 0.0060
## INFO [23:07:54.896] epoch 100, loss 0.0060
# Combine the main and context GloVe vectors (standard GloVe practice) into
# the final embedding matrix for the post-crisis Conservative corpus, then
# cache the result to disk so it can be reloaded without re-fitting.
context_vectors_conser_post <- glove_conser_post$components
glove_embedding_conser_post <- word_vectors_main_conser_post + t(context_vectors_conser_post)
saveRDS(glove_embedding_conser_post, file = "local_glove_conser_post.rds")
Modelling takes time, so for knitting I will load a prepared model produced by the same process.
# Load the pre-trained GloVe embedding for the post-crisis Conservative corpus
# from GitHub so knitting does not have to re-fit the model.
url_conser_post <- "https://github.com/RiverKim-garam/CTA24-Final-assessment/blob/main/local_glove_conser_post.rds?raw=true"
# Bug fix: the result was previously assigned to the misspelled name
# 'golve_embedding_conser_post', so the downloaded model was never actually
# used — all downstream code reads 'glove_embedding_conser_post'.
glove_embedding_conser_post <- readRDS(url(url_conser_post, method = "libcurl"))
Two-dimensional UMAP projection of the full GloVe word embedding
# Project the full post-crisis Conservative GloVe embedding down to two
# dimensions with UMAP (cosine metric) and plot every word as a point.
umap_conser_post <- umap(glove_embedding_conser_post, n_components = 2, metric = "cosine",
                         n_neighbors = 25, min_dist = 0.1, spread = 2)
df_umap_conser_post <- as.data.frame(umap_conser_post$layout)
df_umap_conser_post$word <- rownames(df_umap_conser_post)
colnames(df_umap_conser_post) <- c("Post_Cons1", "Post_Cons2", "word")
# Scatter plot of the whole vocabulary in UMAP space
ggplot(df_umap_conser_post) +
  geom_point(aes(x = Post_Cons1, y = Post_Cons2), color = 'blue', size = 0.05) +
  labs(title = "Conservative Party (Post-Crisis): Word Embeddings of GloVe and UMAP") +
  theme_minimal()
Post-crisis Conservative party word embedding of words related to immigration
# Case 1 ("immigration"): find the 25 nearest words by cosine similarity in
# the post-crisis Conservative embedding and label them on the UMAP plane.
word_conser_post_1 <- glove_embedding_conser_post["immigration", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_conser_post, y = word_conser_post_1, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_post_1 <- inner_join(df_umap_conser_post, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_conser_post_1, aes(x = Post_Cons1, y = Post_Cons2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Post_Cons1, Post_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Conservative party (Post-Crisis) word embedding of 'immigration'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Post-crisis Conservative party word embedding of words related to immigrants
# Case 2 ("immigrants"): find the 25 nearest words by cosine similarity in
# the post-crisis Conservative embedding and label them on the UMAP plane.
word_conser_post_2 <- glove_embedding_conser_post["immigrants", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_conser_post, y = word_conser_post_2, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_post_2 <- inner_join(df_umap_conser_post, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_conser_post_2, aes(x = Post_Cons1, y = Post_Cons2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Post_Cons1, Post_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Conservative party (Post-Crisis) word embedding of 'immigrants'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Post-crisis Conservative party word embedding of words related to asylums
# Case 3 ("asylums"): find the 25 nearest words by cosine similarity in
# the post-crisis Conservative embedding and label them on the UMAP plane.
word_conser_post_3 <- glove_embedding_conser_post["asylums", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_conser_post, y = word_conser_post_3, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_post_3 <- inner_join(df_umap_conser_post, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_conser_post_3, aes(x = Post_Cons1, y = Post_Cons2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Post_Cons1, Post_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Conservative party (Post-Crisis) word embedding of 'asylums'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Post-crisis Conservative party word embedding of words related to asylum
# Case 4 ("asylum"): find the 25 nearest words by cosine similarity in
# the post-crisis Conservative embedding and label them on the UMAP plane.
word_conser_post_4 <- glove_embedding_conser_post["asylum", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_conser_post, y = word_conser_post_4, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_conser_post_4 <- inner_join(df_umap_conser_post, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_conser_post_4, aes(x = Post_Cons1, y = Post_Cons2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Post_Cons1, Post_Cons2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Conservative party (Post-Crisis) word embedding of 'asylum'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Training process
# Fit a GloVe model on the pre-crisis Labour immigration speeches.
# Relies on globals set earlier in the file: COUNT_MIN, WINDOW_SIZE, DIM, ITERS.
set.seed(42L)  # reproducible shuffle of the input documents
shuffled_labour_pre <- sample(data_labour_pre$desc)
it_labour_pre <- itoken(space_tokenizer(shuffled_labour_pre), progressbar = FALSE)
# Build and prune the vocabulary, then the term co-occurrence matrix
pruned_labour_pre <- prune_vocabulary(create_vocabulary(it_labour_pre), term_count_min = COUNT_MIN)
vectorizer_labour_pre <- vocab_vectorizer(pruned_labour_pre)
tcm_labour_pre <- create_tcm(it_labour_pre, vectorizer_labour_pre, skip_grams_window = WINDOW_SIZE,
                             skip_grams_window_context = "symmetric", weights = rep(1, WINDOW_SIZE))
# glove_labour_pre and word_vectors_main_labour_pre are reused further down
glove_labour_pre <- GlobalVectors$new(rank = DIM, x_max = 100, learning_rate = 0.05)
word_vectors_main_labour_pre <- glove_labour_pre$fit_transform(tcm_labour_pre, n_iter = ITERS,
                                                               convergence_tol = 0.001,
                                                               n_threads = RcppParallel::defaultNumThreads())
## INFO [23:09:15.413] epoch 1, loss 0.1892
## INFO [23:09:18.858] epoch 2, loss 0.0795
## INFO [23:09:22.239] epoch 3, loss 0.0570
## INFO [23:09:25.690] epoch 4, loss 0.0458
## INFO [23:09:29.097] epoch 5, loss 0.0387
## INFO [23:09:32.638] epoch 6, loss 0.0339
## INFO [23:09:36.508] epoch 7, loss 0.0302
## INFO [23:09:40.327] epoch 8, loss 0.0274
## INFO [23:09:44.190] epoch 9, loss 0.0252
## INFO [23:09:47.907] epoch 10, loss 0.0234
## INFO [23:09:51.440] epoch 11, loss 0.0219
## INFO [23:09:54.828] epoch 12, loss 0.0206
## INFO [23:09:58.128] epoch 13, loss 0.0195
## INFO [23:10:01.458] epoch 14, loss 0.0185
## INFO [23:10:04.784] epoch 15, loss 0.0176
## INFO [23:10:08.740] epoch 16, loss 0.0169
## INFO [23:10:12.354] epoch 17, loss 0.0162
## INFO [23:10:16.037] epoch 18, loss 0.0156
## INFO [23:10:19.566] epoch 19, loss 0.0151
## INFO [23:10:23.350] epoch 20, loss 0.0145
## INFO [23:10:26.778] epoch 21, loss 0.0141
## INFO [23:10:30.342] epoch 22, loss 0.0137
## INFO [23:10:33.750] epoch 23, loss 0.0133
## INFO [23:10:37.203] epoch 24, loss 0.0129
## INFO [23:10:40.669] epoch 25, loss 0.0126
## INFO [23:10:44.125] epoch 26, loss 0.0123
## INFO [23:10:47.529] epoch 27, loss 0.0120
## INFO [23:10:50.961] epoch 28, loss 0.0117
## INFO [23:10:54.393] epoch 29, loss 0.0114
## INFO [23:10:57.879] epoch 30, loss 0.0112
## INFO [23:11:01.465] epoch 31, loss 0.0109
## INFO [23:11:04.971] epoch 32, loss 0.0107
## INFO [23:11:08.474] epoch 33, loss 0.0105
## INFO [23:11:11.950] epoch 34, loss 0.0103
## INFO [23:11:15.449] epoch 35, loss 0.0101
## INFO [23:11:18.898] epoch 36, loss 0.0099
## INFO [23:11:22.353] epoch 37, loss 0.0098
## INFO [23:11:25.752] epoch 38, loss 0.0096
## INFO [23:11:29.178] epoch 39, loss 0.0094
## INFO [23:11:32.676] epoch 40, loss 0.0093
## INFO [23:11:36.230] epoch 41, loss 0.0091
## INFO [23:11:39.684] epoch 42, loss 0.0090
## INFO [23:11:43.115] epoch 43, loss 0.0088
## INFO [23:11:46.594] epoch 44, loss 0.0087
## INFO [23:11:50.042] epoch 45, loss 0.0086
## INFO [23:11:53.462] epoch 46, loss 0.0085
## INFO [23:11:56.889] epoch 47, loss 0.0083
## INFO [23:12:00.465] epoch 48, loss 0.0082
## INFO [23:12:03.912] epoch 49, loss 0.0081
## INFO [23:12:07.378] epoch 50, loss 0.0080
## INFO [23:12:10.857] epoch 51, loss 0.0079
## INFO [23:12:14.313] epoch 52, loss 0.0078
## INFO [23:12:17.773] epoch 53, loss 0.0077
## INFO [23:12:21.233] epoch 54, loss 0.0076
## INFO [23:12:24.718] epoch 55, loss 0.0075
## INFO [23:12:28.176] epoch 56, loss 0.0074
## INFO [23:12:31.638] epoch 57, loss 0.0073
## INFO [23:12:35.083] epoch 58, loss 0.0072
## INFO [23:12:38.518] epoch 59, loss 0.0071
## INFO [23:12:41.968] epoch 60, loss 0.0071
## INFO [23:12:45.429] epoch 61, loss 0.0070
## INFO [23:12:48.877] epoch 62, loss 0.0069
## INFO [23:12:52.296] epoch 63, loss 0.0068
## INFO [23:12:55.862] epoch 64, loss 0.0067
## INFO [23:12:59.356] epoch 65, loss 0.0067
## INFO [23:13:02.870] epoch 66, loss 0.0066
## INFO [23:13:06.476] epoch 67, loss 0.0065
## INFO [23:13:10.229] epoch 68, loss 0.0065
## INFO [23:13:14.556] epoch 69, loss 0.0064
## INFO [23:13:17.982] epoch 70, loss 0.0063
## INFO [23:13:21.534] epoch 71, loss 0.0063
## INFO [23:13:25.000] epoch 72, loss 0.0062
## INFO [23:13:28.721] epoch 73, loss 0.0061
## INFO [23:13:32.234] epoch 74, loss 0.0061
## INFO [23:13:35.717] epoch 75, loss 0.0060
## INFO [23:13:39.265] epoch 76, loss 0.0060
## INFO [23:13:42.780] epoch 77, loss 0.0059
## INFO [23:13:46.286] epoch 78, loss 0.0059
## INFO [23:13:49.787] epoch 79, loss 0.0058
## INFO [23:13:53.442] epoch 80, loss 0.0057
## INFO [23:13:57.300] epoch 81, loss 0.0057
## INFO [23:14:00.917] epoch 82, loss 0.0056
## INFO [23:14:04.506] epoch 83, loss 0.0056
## INFO [23:14:08.091] epoch 84, loss 0.0055
## INFO [23:14:11.603] epoch 85, loss 0.0055
## INFO [23:14:15.109] epoch 86, loss 0.0054
## INFO [23:14:18.546] epoch 87, loss 0.0054
## INFO [23:14:22.150] epoch 88, loss 0.0054
## INFO [23:14:25.618] epoch 89, loss 0.0053
## INFO [23:14:29.067] epoch 90, loss 0.0053
## INFO [23:14:32.569] epoch 91, loss 0.0052
## INFO [23:14:36.053] epoch 92, loss 0.0052
## INFO [23:14:39.567] epoch 93, loss 0.0051
## INFO [23:14:43.044] epoch 94, loss 0.0051
## INFO [23:14:46.559] epoch 95, loss 0.0051
## INFO [23:14:49.994] epoch 96, loss 0.0050
## INFO [23:14:53.478] epoch 97, loss 0.0050
## INFO [23:14:57.057] epoch 98, loss 0.0049
## INFO [23:15:00.615] epoch 99, loss 0.0049
## INFO [23:15:04.081] epoch 100, loss 0.0049
# Combine the main and context GloVe vectors (standard GloVe practice) into
# the final embedding matrix for the pre-crisis Labour corpus, then cache
# the result to disk so it can be reloaded without re-fitting.
context_vectors_labour_pre <- glove_labour_pre$components
glove_embedding_labour_pre <- word_vectors_main_labour_pre + t(context_vectors_labour_pre)
saveRDS(glove_embedding_labour_pre, file = "local_glove_labour_pre.rds")
Modelling takes time, so for knitting I will load a prepared model produced by the same process.
# Load the pre-trained GloVe embedding for the pre-crisis Labour corpus
# from GitHub so knitting does not have to re-fit the model.
url_labour_pre <- "https://github.com/RiverKim-garam/CTA24-Final-assessment/blob/main/local_glove_labour_pre.rds?raw=true"
# Bug fix: the result was previously assigned to the misspelled name
# 'golve_embedding_labour_pre', so the downloaded model was never actually
# used — all downstream code reads 'glove_embedding_labour_pre'.
glove_embedding_labour_pre <- readRDS(url(url_labour_pre, method = "libcurl"))
Two-dimensional UMAP projection of the full GloVe word embedding
# Project the full pre-crisis Labour GloVe embedding down to two dimensions
# with UMAP (cosine metric) and plot every word as a point.
umap_labour_pre <- umap(glove_embedding_labour_pre, n_components = 2, metric = "cosine",
                        n_neighbors = 25, min_dist = 0.1, spread = 2)
df_umap_labour_pre <- as.data.frame(umap_labour_pre$layout)
df_umap_labour_pre$word <- rownames(df_umap_labour_pre)
colnames(df_umap_labour_pre) <- c("Pre_Lab1", "Pre_Lab2", "word")
# Scatter plot of the whole vocabulary in UMAP space
ggplot(df_umap_labour_pre) +
  geom_point(aes(x = Pre_Lab1, y = Pre_Lab2), color = 'blue', size = 0.05) +
  labs(title = "Labour Party (Pre-Crisis): Word Embeddings of GloVe and UMAP") +
  theme_minimal()
Pre-crisis Labour party word embedding of words related to immigration
# Case 1 ("immigration"): find the 25 nearest words by cosine similarity in
# the pre-crisis Labour embedding and label them on the UMAP plane.
word_labour_pre_1 <- glove_embedding_labour_pre["immigration", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_labour_pre, y = word_labour_pre_1, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_labour_pre_1 <- inner_join(df_umap_labour_pre, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_labour_pre_1, aes(x = Pre_Lab1, y = Pre_Lab2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Pre_Lab1, Pre_Lab2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Labour party (Pre-Crisis) word embedding of 'immigration'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Pre-crisis Labour party word embedding of words related to immigrants
# Case 2 ("immigrants"): find the 25 nearest words by cosine similarity in
# the pre-crisis Labour embedding and label them on the UMAP plane.
word_labour_pre_2 <- glove_embedding_labour_pre["immigrants", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_labour_pre, y = word_labour_pre_2, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_labour_pre_2 <- inner_join(df_umap_labour_pre, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_labour_pre_2, aes(x = Pre_Lab1, y = Pre_Lab2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Pre_Lab1, Pre_Lab2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Labour party (Pre-Crisis) word embedding of 'immigrants'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Pre-crisis Labour party word embedding of words related to asylum
# Case 3 ("asylum"): find the 25 nearest words by cosine similarity in
# the pre-crisis Labour embedding and label them on the UMAP plane.
word_labour_pre_3 <- glove_embedding_labour_pre["asylum", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_labour_pre, y = word_labour_pre_3, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_labour_pre_3 <- inner_join(df_umap_labour_pre, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_labour_pre_3, aes(x = Pre_Lab1, y = Pre_Lab2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Pre_Lab1, Pre_Lab2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Labour party (Pre-Crisis) word embedding of 'asylum'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Training process
# Fit a GloVe model on the post-crisis Labour immigration speeches.
# Relies on globals set earlier in the file: COUNT_MIN, WINDOW_SIZE, DIM, ITERS.
set.seed(42L)  # reproducible shuffle of the input documents
shuffled_labour_post <- sample(data_labour_post$desc)
it_labour_post <- itoken(space_tokenizer(shuffled_labour_post), progressbar = FALSE)
# Build and prune the vocabulary, then the term co-occurrence matrix
pruned_labour_post <- prune_vocabulary(create_vocabulary(it_labour_post), term_count_min = COUNT_MIN)
vectorizer_labour_post <- vocab_vectorizer(pruned_labour_post)
tcm_labour_post <- create_tcm(it_labour_post, vectorizer_labour_post, skip_grams_window = WINDOW_SIZE,
                              skip_grams_window_context = "symmetric", weights = rep(1, WINDOW_SIZE))
# glove_labour_post and word_vectors_main_labour_post are reused further down
glove_labour_post <- GlobalVectors$new(rank = DIM, x_max = 100, learning_rate = 0.05)
word_vectors_main_labour_post <- glove_labour_post$fit_transform(tcm_labour_post, n_iter = ITERS,
                                                                 convergence_tol = 0.001,
                                                                 n_threads = RcppParallel::defaultNumThreads())
## INFO [23:16:12.158] epoch 1, loss 0.1722
## INFO [23:16:16.939] epoch 2, loss 0.0736
## INFO [23:16:21.657] epoch 3, loss 0.0534
## INFO [23:16:26.435] epoch 4, loss 0.0433
## INFO [23:16:31.177] epoch 5, loss 0.0369
## INFO [23:16:36.012] epoch 6, loss 0.0326
## INFO [23:16:41.080] epoch 7, loss 0.0293
## INFO [23:16:47.444] epoch 8, loss 0.0268
## INFO [23:16:54.615] epoch 9, loss 0.0248
## INFO [23:17:00.502] epoch 10, loss 0.0231
## INFO [23:17:05.492] epoch 11, loss 0.0217
## INFO [23:17:10.443] epoch 12, loss 0.0205
## INFO [23:17:15.260] epoch 13, loss 0.0195
## INFO [23:17:20.019] epoch 14, loss 0.0186
## INFO [23:17:24.734] epoch 15, loss 0.0179
## INFO [23:17:29.575] epoch 16, loss 0.0172
## INFO [23:17:34.344] epoch 17, loss 0.0165
## INFO [23:17:39.211] epoch 18, loss 0.0160
## INFO [23:17:44.049] epoch 19, loss 0.0155
## INFO [23:17:48.842] epoch 20, loss 0.0150
## INFO [23:17:53.613] epoch 21, loss 0.0146
## INFO [23:17:58.663] epoch 22, loss 0.0142
## INFO [23:18:03.867] epoch 23, loss 0.0138
## INFO [23:18:08.705] epoch 24, loss 0.0135
## INFO [23:18:14.131] epoch 25, loss 0.0131
## INFO [23:18:18.893] epoch 26, loss 0.0128
## INFO [23:18:23.617] epoch 27, loss 0.0126
## INFO [23:18:28.388] epoch 28, loss 0.0123
## INFO [23:18:33.455] epoch 29, loss 0.0121
## INFO [23:18:38.314] epoch 30, loss 0.0118
## INFO [23:18:43.074] epoch 31, loss 0.0116
## INFO [23:18:47.823] epoch 32, loss 0.0114
## INFO [23:18:52.588] epoch 33, loss 0.0112
## INFO [23:18:57.351] epoch 34, loss 0.0110
## INFO [23:19:02.673] epoch 35, loss 0.0108
## INFO [23:19:07.470] epoch 36, loss 0.0106
## INFO [23:19:12.222] epoch 37, loss 0.0104
## INFO [23:19:16.945] epoch 38, loss 0.0103
## INFO [23:19:21.927] epoch 39, loss 0.0101
## INFO [23:19:26.698] epoch 40, loss 0.0100
## INFO [23:19:31.515] epoch 41, loss 0.0098
## INFO [23:19:36.275] epoch 42, loss 0.0097
## INFO [23:19:40.973] epoch 43, loss 0.0095
## INFO [23:19:45.728] epoch 44, loss 0.0094
## INFO [23:19:50.474] epoch 45, loss 0.0093
## INFO [23:19:55.249] epoch 46, loss 0.0092
## INFO [23:20:00.057] epoch 47, loss 0.0090
## INFO [23:20:04.822] epoch 48, loss 0.0089
## INFO [23:20:09.586] epoch 49, loss 0.0088
## INFO [23:20:14.544] epoch 50, loss 0.0087
## INFO [23:20:19.351] epoch 51, loss 0.0086
## INFO [23:20:24.204] epoch 52, loss 0.0085
## INFO [23:20:28.974] epoch 53, loss 0.0084
## INFO [23:20:33.687] epoch 54, loss 0.0083
## INFO [23:20:38.475] epoch 55, loss 0.0082
## INFO [23:20:43.299] epoch 56, loss 0.0081
## INFO [23:20:48.031] epoch 57, loss 0.0080
## INFO [23:20:52.892] epoch 58, loss 0.0079
## INFO [23:20:57.649] epoch 59, loss 0.0078
## INFO [23:21:02.444] epoch 60, loss 0.0078
## INFO [23:21:07.369] epoch 61, loss 0.0077
## INFO [23:21:12.166] epoch 62, loss 0.0076
## INFO [23:21:16.924] epoch 63, loss 0.0075
## INFO [23:21:21.751] epoch 64, loss 0.0074
## INFO [23:21:26.507] epoch 65, loss 0.0074
## INFO [23:21:31.308] epoch 66, loss 0.0073
## INFO [23:21:36.145] epoch 67, loss 0.0072
## INFO [23:21:40.876] epoch 68, loss 0.0072
## INFO [23:21:45.691] epoch 69, loss 0.0071
## INFO [23:21:50.577] epoch 70, loss 0.0070
## INFO [23:21:55.651] epoch 71, loss 0.0070
## INFO [23:22:00.643] epoch 72, loss 0.0069
## INFO [23:22:05.406] epoch 73, loss 0.0068
## INFO [23:22:10.186] epoch 74, loss 0.0068
## INFO [23:22:14.979] epoch 75, loss 0.0067
## INFO [23:22:19.833] epoch 76, loss 0.0067
## INFO [23:22:24.767] epoch 77, loss 0.0066
## INFO [23:22:29.731] epoch 78, loss 0.0065
## INFO [23:22:34.584] epoch 79, loss 0.0065
## INFO [23:22:39.466] epoch 80, loss 0.0064
## INFO [23:22:44.224] epoch 81, loss 0.0064
## INFO [23:22:48.922] epoch 82, loss 0.0063
## INFO [23:22:53.737] epoch 83, loss 0.0063
## INFO [23:22:58.506] epoch 84, loss 0.0062
## INFO [23:23:03.550] epoch 85, loss 0.0062
## INFO [23:23:08.728] epoch 86, loss 0.0061
## INFO [23:23:14.471] epoch 87, loss 0.0061
## INFO [23:23:19.789] epoch 88, loss 0.0060
## INFO [23:23:25.579] epoch 89, loss 0.0060
## INFO [23:23:30.844] epoch 90, loss 0.0059
## INFO [23:23:36.073] epoch 91, loss 0.0059
## INFO [23:23:41.414] epoch 92, loss 0.0058
## INFO [23:23:47.902] epoch 93, loss 0.0058
## INFO [23:23:52.885] epoch 94, loss 0.0058
## INFO [23:23:58.018] epoch 95, loss 0.0057
## INFO [23:24:02.828] epoch 96, loss 0.0057
## INFO [23:24:07.570] epoch 97, loss 0.0056
## INFO [23:24:12.462] epoch 98, loss 0.0056
## INFO [23:24:17.176] epoch 99, loss 0.0055
## INFO [23:24:21.779] epoch 100, loss 0.0055
# Combine the main and context GloVe vectors (standard GloVe practice) into
# the final embedding matrix for the post-crisis Labour corpus, then cache
# the result to disk so it can be reloaded without re-fitting.
context_vectors_labour_post <- glove_labour_post$components
glove_embedding_labour_post <- word_vectors_main_labour_post + t(context_vectors_labour_post)
saveRDS(glove_embedding_labour_post, file = "local_glove_labour_post.rds")
Modelling takes time, so for knitting I will load a prepared model produced by the same process.
# Load the pre-trained GloVe embedding for the post-crisis Labour corpus
# from GitHub so knitting does not have to re-fit the model.
url_labour_post <- "https://github.com/RiverKim-garam/CTA24-Final-assessment/blob/main/local_glove_labour_post.rds?raw=true"
# Bug fix: the result was previously assigned to the misspelled name
# 'golve_embedding_labour_post', so the downloaded model was never actually
# used — all downstream code reads 'glove_embedding_labour_post'.
glove_embedding_labour_post <- readRDS(url(url_labour_post, method = "libcurl"))
Two-dimensional UMAP projection of the full GloVe word embedding
# Project the full post-crisis Labour GloVe embedding down to two dimensions
# with UMAP (cosine metric) and plot every word as a point.
umap_labour_post <- umap(glove_embedding_labour_post, n_components = 2, metric = "cosine",
                         n_neighbors = 25, min_dist = 0.1, spread = 2)
df_umap_labour_post <- as.data.frame(umap_labour_post$layout)
df_umap_labour_post$word <- rownames(df_umap_labour_post)
colnames(df_umap_labour_post) <- c("Post_Lab1", "Post_Lab2", "word")
# Scatter plot of the whole vocabulary in UMAP space
ggplot(df_umap_labour_post) +
  geom_point(aes(x = Post_Lab1, y = Post_Lab2), color = 'blue', size = 0.05) +
  labs(title = "Labour Party (Post-Crisis): Word Embeddings via GloVe and UMAP") +
  theme_minimal()
Post-crisis Labour party word embedding of words related to immigration
# Case 1 ("immigration"): find the 25 nearest words by cosine similarity in
# the post-crisis Labour embedding and label them on the UMAP plane.
word_labour_post_1 <- glove_embedding_labour_post["immigration", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_labour_post, y = word_labour_post_1, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_labour_post_1 <- inner_join(df_umap_labour_post, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_labour_post_1, aes(x = Post_Lab1, y = Post_Lab2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Post_Lab1, Post_Lab2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Labour party (Post-Crisis) word embedding of 'immigration'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Post-crisis Labour party word embedding of words related to immigrants
# Case 2 ("immigrants"): find the 25 nearest words by cosine similarity in
# the post-crisis Labour embedding and label them on the UMAP plane.
word_labour_post_2 <- glove_embedding_labour_post["immigrants", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_labour_post, y = word_labour_post_2, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_labour_post_2 <- inner_join(df_umap_labour_post, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_labour_post_2, aes(x = Post_Lab1, y = Post_Lab2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Post_Lab1, Post_Lab2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Labour party (Post-Crisis) word embedding of 'immigrants'") +
  theme(plot.title = element_text(hjust = .5, size = 14))
Post-crisis Labour party word embedding of words related to asylum
# Case 3 ("asylum"): find the 25 nearest words by cosine similarity in
# the post-crisis Labour embedding and label them on the UMAP plane.
word_labour_post_3 <- glove_embedding_labour_post["asylum", , drop = FALSE]
cos_sim <- sim2(x = glove_embedding_labour_post, y = word_labour_post_3, method = "cosine", norm = "l2")
nearest <- data.frame(word = names(head(sort(cos_sim[, 1], decreasing = TRUE), 25)))
selected_words_labour_post_3 <- inner_join(df_umap_labour_post, y = nearest, by = "word")
# Label the selected words at their UMAP coordinates
ggplot(selected_words_labour_post_3, aes(x = Post_Lab1, y = Post_Lab2)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(Post_Lab1, Post_Lab2, label = word), show.legend = FALSE, size = 2.5, vjust = -1.5, hjust = 0) +
  labs(title = "Labour party (Post-Crisis) word embedding of 'asylum'") +
  theme(plot.title = element_text(hjust = .5, size = 14))